Introduction

subdiagnosis <- readr::read_tsv(
  file.path("..", "..", "..", "data", "current", params$scpca_project_id, "single_cell_metadata.tsv"),
  show_col_types = FALSE
  ) |>
  dplyr::filter(scpca_sample_id == params$sample_id) |>
  dplyr::pull(subdiagnosis)

This notebook explores using infercnv to estimate tumor and normal cells in SCPCS000184 from SCPCP000006. This sample has a(n) Anaplastic subdiagnosis.

infercnv was run using the 06_inferCNV.R script with and without a normal reference. We also tested the impact of the subselection of normal cells using either immune, and/or endothelial cells as healthy reference.

In this notebook, we just want to compare the heatmaps of CNV profiles, and evaluate how comparable are the methods and how sensible they are to key parameters such as selection of healthy reference.

Libraries

library(infercnv)
library(SCpubr)
library(ggplot2)
library(patchwork)
library(Seurat)

Functions

Here we defined function that will be used multiple time all along the notebook.

Visualize CNV grouped by clusters or other metadata

For a Seurat object object, the function Do_CNV_heatmap load the infercnv object created with the script 06_infercnv.R using reference_value as a reference and call the function SCpubr::do_CopyNumberVariantPlot to plot the mean CNV score in each group defined in group.by.

  • object is the Seurat object

  • infercnv_obj is the infercnv object

  • group.by is the metadata used for grouping the violin plots

Do_CNV_heatmap <- function(object, infercnv_obj, group.by){
  

  chromosome_locations = SCpubr::human_chr_locations

  out <- SCpubr::do_CopyNumberVariantPlot(sample = object,
                                        infercnv_object = infercnv_obj,
                                        using_metacells = FALSE,
                                        chromosome_locations = chromosome_locations,
                                        return_object = FALSE,
                                        group.by = group.by
                                        )
  out <- out + 
    ggtitle(glue::glue("Copy Number Variant Plot, ", reference_value)) +
    ylab(label = "")
  
  return(out)
}

Calculate a global CNV score per cell to check the general distribution

For a Seurat object an infercnv object created with the script 06_infercnv.R using reference_value as a reference, the function Do_CNV_score load the infercnv object and calculate a CNV score per cell. The score is calculated based on the biostar discussion. It

  • reference_value is the selection of normal cells used for infercnv
Do_CNV_score <- function(seurat_oject,infercnv_obj,reference_value){
  
  score <- apply(infercnv_obj@expr.data,2,function(x){ sum(x < 0.90 | x > 1.10)/length(x) })
  
  seurat_obj <- AddMetaData(seurat_oject, score, col.name = glue::glue("CNV-score_", reference_value))
  return(seurat_obj)
}

Visualize seurat clusters and metadata

For a Seurat object objectand a metadata metadata, the function visualize_metadata will plot FeaturePlot and BarPlot

  • object is the Seurat object

  • meta the gene or quantitative value to be plotted

  • group.by is the metadata used for grouping the violin plots

visualize_metadata <- function(object, meta, group.by){
  
  if(is.numeric(object@meta.data[,meta])){
     d <- SCpubr::do_FeaturePlot(object, 
                                              features = meta, 
                                              pt.size = 0.2, 
                                              legend.width = 0.5, 
                                              legend.length = 5, 
                                              legend.position = "right") + ggtitle(meta)
    b <- SCpubr::do_ViolinPlot(object, 
                                             features = meta, 
                                             ncol = 1, 
                                             group.by = group.by, 
                                             legend.position = "none")
                   
   return(d + b + plot_layout(ncol = 2, widths = c(2,4))) 
  }
  
  
  else{
    
  
  d <- SCpubr::do_DimPlot(object, reduction="umap", group.by = group.by, label = TRUE, repel = TRUE) + ggtitle(paste0(meta," - umap")) + theme(text=element_text(size=18))
  b <- SCpubr::do_BarPlot(sample = object,
                         group.by = meta,
                         split.by = group.by,
                         position = "fill",
                         font.size = 10,
                         legend.ncol = 3) +
                         ggtitle("% cells")+
                         xlab(print(group.by)) +
                         theme(text=element_text(size=18))
  return(d + b + plot_layout(ncol = 2, widths = c(2,4)))
  }
  
}

Visualize seurat clusters and markers genes

For a Seurat object objectand a features features, the function visualize_feature will plot FeaturePlot and ViolinPlot

  • object is the Seurat object

  • features the gene or quantitative value to be plotted

  • group.by is the metadata used for grouping the violin plots

visualize_feature <- function(object, features, group.by ){
 
                  d <- SCpubr::do_FeaturePlot(object, 
                                              features = features, 
                                              pt.size = 0.2, 
                                              legend.width = 0.5, 
                                              legend.length = 5, 
                                              legend.position = "right") + ggtitle(features)
                  b <- SCpubr::do_ViolinPlot(object, 
                                             features = features, 
                                             ncol = 1, 
                                             group.by = group.by, 
                                             legend.position = "none",
                                             assay = "SCT") + ylab(features)
                   
                  return(d + b + plot_layout(ncol = 2, widths = c(2,4))) 
}

Visualize CNV density

For a Seurat object objectand a features features, the function visualize_feature will plot FeaturePlot and ViolinPlot

  • object is the Seurat object

  • features the gene or quantitative value to be plotted

  • group.by is the metadata used for grouping the violin plots

visualize_density <- function(object, features, group.by ){
 
                  d <- SCpubr::do_RidgePlot(object, 
                                            feature = features, 
                                            group.by = group.by, 
                                             legend.position = "none",
                                             assay = "SCT") + ylab(features)
                   
                  return(d) 
}

Base directories

# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")

Input files

The input for this notebook are the results of 06_inferCNV.R

result_dir <- file.path(module_base, "results", params$sample_id, "06_infercnv")

Seurat object

We load the Seurat object generated in 06_infercnv.R

srat <- readRDS(file.path(module_base, "results", params$sample_id, glue::glue("02b-fetal_kidney_label-transfer_", params$sample_id, ".Rds")))

Analysis

Heatmap of infercnv results

Here we plot the output of infercnv as heatmaps of CNV. We first look at the png file generated by the infercnv function. We then used the infercnv object to look at mean CNV value across compartments (immune, endothelial, stroma and fetal nephron).

Without reference

With immune cells as reference

With endothelium cells as reference

With immune and endothelium cells as reference

Summarize CNV per chromosome and compartment

infercnv_obj <- list()
for(reference_value in c("reference-none", "reference-immune", "reference-endothelium", "reference-both")){
  infercnv_obj[[reference_value]] <- readRDS(file.path(result_dir, glue::glue(reference_value, "_HMM-no") , glue::glue("06_infercnv_", params$sample_id, "_", reference_value, "_HMM-no.rds")))
  print(Do_CNV_heatmap(object = srat, infercnv_obj = infercnv_obj[[reference_value]], group.by = "fetal_kidney_predicted.compartment"))
}

Try to calculate single CNV score and see if it can be use to define cells with CNV

for(reference_value in c("reference-none", "reference-immune", "reference-endothelium", "reference-both")){
  srat <- Do_CNV_score(srat,infercnv_obj = infercnv_obj[[reference_value]], reference_value)
  p1 <- visualize_feature(srat, glue::glue("CNV-score_", reference_value) , group.by = "fetal_kidney_predicted.compartment" )
  p2 <- visualize_density(srat, features=glue::glue("CNV-score_", reference_value), group.by = "fetal_kidney_predicted.compartment")
  print(p1 + p2 + plot_layout(ncol = 3, widths = c(1,2,2)))
}

This unique CNV score do not look so promising. We might have to select chromosomes we would like to look at, i.e. the one relevant for Wilms tumor.

Explore infercnv results generated with immune and endothelial cells as reference, using a HMM i3 model

Load the Seurat object

We load the Seurat object generated in 06_infercnv.R

srat <- readRDS(file.path(module_base, "results", params$sample_id, glue::glue("06_infercnv_HMM-i3_", params$sample_id, "_reference-both.rds")))

Feature plot and repartition of the CNV per chromosome

For each chromosome, we look at the repartition of the proportion_cnv_ in cells labeled as immune, endothelial, stroma and fetal nephron. proportion_cnv_ is the proportion in number of genes that are part of any cnv/loss/duplication in the given chr.

This should allow a quick visual check that immune and endothelial cells do not expressed any CNV. If so, we should check if genes driving the CNV score are not key immune or endothelium genes.

Also, we should visualize if one area of the umap do not have CNV and could be identify as additional normal cell subset.

for(i in 1:22){
  print(visualize_feature(srat, features=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment"))
}

Distribution of CNV estimation in the Wilms tumor copartments

For each chromosome, we look at the distribution of the proportion_cnv_ in cells labeled as immune, endothelial, stroma and fetal nephron. proportion_cnv_ is the proportion in number of genes that are part of any cnv/loss/duplication in the given chr.

We are quite confident that immune and endothelial cells are well identified by label transfer done in 02b_label-transfer_fetal_kidney_reference_Stewart.Rmd. The distribution of CNV for endothelial and immune cells should thus be a single peak center on 0.

We do not know if fetal nephron and stroma cells are a mix of normal and cancer cells. Would they be a group of normal cells, we should expect a single peak center on 0 for every chromosome. As we expect to have a large number of cancer with heterogeneous CNV, we should see multiple peaks.

for(i in 1:22){
  print(visualize_density(srat, features=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment"))
}

Explore infercnv results generated with immune and endothelial cells as reference, using a HMM i6 model

Load the Seurat object

We load the Seurat object generated in 06_infercnv.R

srat <- readRDS(file.path(module_base, "results", params$sample_id, glue::glue("06_infercnv_HMM-i6_", params$sample_id, "_reference-both.rds")))

Feature plot and repartition of the CNV per chromosome

For each chromosome, we look at the repartition of the proportion_cnv_ in cells labeled as immune, endothelial, stroma and fetal nephron. proportion_cnv_ is the proportion in number of genes that are part of any cnv/loss/duplication in the given chr.

This should allow a quick visual check that immune and endothelial cells do not expressed any CNV. If so, we should check if genes driving the CNV score are not key immune or endothelium genes.

Also, we should visualize if one area of the umap do not have CNV and could be identify as additional normal cell subset.

for(i in 1:22){
  print(visualize_feature(srat, features=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment"))
}

Distribution of CNV estimation in the Wilms tumor copartments

For each chromosome, we look at the distribution of the proportion_cnv_ in cells labeled as immune, endothelial, stroma and fetal nephron. proportion_cnv_ is the proportion in number of genes that are part of any cnv/loss/duplication in the given chr.

We are quite confident that immune and endothelial cells are well identified by label transfer done in 02b_label-transfer_fetal_kidney_reference_Stewart.Rmd. The distribution of CNV for endothelial and immune cells should thus be a single peak center on 0.

We do not know if fetal nephron and stroma cells are a mix of normal and cancer cells. Would they be a group of normal cells, we should expect a single peak center on 0 for every chromosome. As we expect to have a large number of cancer with heterogeneous CNV, we should see multiple peaks.

for(i in 1:22){
  print(visualize_density(srat, features=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment"))
}

Conclusions

  • Providing no reference is not a good option, as we think that most of the cells are cancer cells with few CNV. Without healthy reference, infercnv take the mean of expression across the cells as a reference. In case of events that are well shared by a majority of the cells, they will be smoothed.

  • For some CNV and specific chromosome, the choice a the reference do not affect the result: ch18, ch17.

  • Some CNV values differs depending on the choice of the reference, for chr5 for example.

This is an issue if we want to have the exact CNV pattern.

There are three samples with a very low (<5) number of immune + endothelial cells: SCPCS000197 (4 endothelial), SCPCS000190 (1 immune), SCPCS000177 (2 immune) for which it might be problematic to run infercnv.

I am not sure if some endothelial cells might be missclassified? To be careful when we look at % of alterations within a compartment, as there are sometimes very few immune and/or endothelial cells, some % can be easily high without being meaningful.

HMM-i3 model looks “cleaner” than the HMM-i6 model, with less CNV found in immune and endothelial cells.

Session Info

# record the versions of the packages used in this analysis and other environment information
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Vienna
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] optparse_1.7.5     Seurat_5.1.0       SeuratObject_5.0.2 sp_2.1-4           patchwork_1.2.0    ggplot2_3.5.1      SCpubr_2.0.2       infercnv_1.20.0   
## 
## loaded via a namespace (and not attached):
##   [1] fs_1.6.4                    matrixStats_1.3.0           spatstat.sparse_3.1-0       bitops_1.0-8                httr_1.4.7                  RColorBrewer_1.1-3         
##   [7] doParallel_1.0.17           tools_4.4.1                 sctransform_0.4.1           utf8_1.2.4                  R6_2.5.1                    DT_0.33                    
##  [13] lazyeval_0.2.2              uwot_0.2.2                  ggdist_3.3.2                withr_3.0.1                 gridExtra_2.3               parallelDist_0.2.6         
##  [19] progressr_0.14.0            argparse_2.2.3              cli_3.6.3                   Biobase_2.64.0              formatR_1.14                spatstat.explore_3.3-2     
##  [25] fastDummies_1.7.4           sandwich_3.1-1              sass_0.4.9                  labeling_0.4.3              mvtnorm_1.3-1               spatstat.data_3.1-2        
##  [31] readr_2.1.5                 ggridges_0.5.6              pbapply_1.7-2               yulab.utils_0.1.7           parallelly_1.38.0           limma_3.60.4               
##  [37] rstudioapi_0.16.0           RSQLite_2.3.7               gridGraphics_0.5-1          generics_0.1.3              gtools_3.9.5                ica_1.0-3                  
##  [43] spatstat.random_3.3-1       vroom_1.6.5                 dplyr_1.1.4                 distributional_0.5.0        Matrix_1.7-0                futile.logger_1.4.3        
##  [49] fansi_1.0.6                 S4Vectors_0.42.1            abind_1.4-5                 lifecycle_1.0.4             multcomp_1.4-26             yaml_2.3.10                
##  [55] edgeR_4.2.1                 SummarizedExperiment_1.34.0 gplots_3.1.3.1              SparseArray_1.4.8           Rtsne_0.17                  grid_4.4.1                 
##  [61] blob_1.2.4                  promises_1.3.0              crayon_1.5.3                miniUI_0.1.1.1              lattice_0.22-6              cowplot_1.1.3              
##  [67] KEGGREST_1.44.1             pillar_1.9.0                knitr_1.48                  GenomicRanges_1.56.1        future.apply_1.11.2         codetools_0.2-20           
##  [73] leiden_0.4.3.1              glue_1.7.0                  spatstat.univar_3.0-0       data.table_1.16.0           vctrs_0.6.5                 png_0.1-8                  
##  [79] spam_2.10-0                 gtable_0.3.5                assertthat_0.2.1            cachem_1.1.0                xfun_0.47                   S4Arrays_1.4.1             
##  [85] mime_0.12                   libcoin_1.0-10              coda_0.19-4.1               survival_3.7-0              DElegate_1.2.1              SingleCellExperiment_1.26.0
##  [91] iterators_1.0.14            statmod_1.5.0               fitdistrplus_1.2-1          TH.data_1.1-2               ROCR_1.0-11                 nlme_3.1-166               
##  [97] bit64_4.0.5                 RcppAnnoy_0.0.22            GenomeInfoDb_1.40.1         rprojroot_2.0.4             bslib_0.8.0                 irlba_2.3.5.1              
## [103] KernSmooth_2.23-24          colorspace_2.1-1            BiocGenerics_0.50.0         DBI_1.2.3                   tidyselect_1.2.1            bit_4.0.5                  
## [109] compiler_4.4.1              DelayedArray_0.30.1         plotly_4.10.4               scales_1.3.0                caTools_1.18.2              lmtest_0.9-40              
## [115] stringr_1.5.1               digest_0.6.37               goftest_1.2-3               spatstat.utils_3.1-0        rmarkdown_2.28              XVector_0.44.0             
## [121] htmltools_0.5.8.1           pkgconfig_2.0.3             MatrixGenerics_1.16.0       highr_0.11                  fastmap_1.2.0               rlang_1.1.4                
## [127] htmlwidgets_1.6.4           UCSC.utils_1.0.0            shiny_1.9.1                 jquerylib_0.1.4             farver_2.1.2                zoo_1.8-12                 
## [133] jsonlite_1.8.8              magrittr_2.0.3              ggplotify_0.1.2             modeltools_0.2-23           GenomeInfoDbData_1.2.12     dotCall64_1.1-1            
## [139] munsell_0.5.1               Rcpp_1.0.13                 ape_5.8                     viridis_0.6.5               reticulate_1.38.0           stringi_1.8.4              
## [145] zlibbioc_1.50.0             MASS_7.3-61                 plyr_1.8.9                  parallel_4.4.1              listenv_0.9.1               ggrepel_0.9.5              
## [151] forcats_1.0.0               deldir_2.0-4                Biostrings_2.72.1           splines_4.4.1               tensor_1.5                  hms_1.1.3                  
## [157] locfit_1.5-9.10             igraph_2.0.3                fastcluster_1.2.6           spatstat.geom_3.3-2         RcppHNSW_0.6.0              reshape2_1.4.4             
## [163] stats4_4.4.1                futile.options_1.0.1        evaluate_0.24.0             RcppParallel_5.1.9          lambda.r_1.2.4              renv_1.0.7                 
## [169] BiocManager_1.30.25         tzdb_0.4.0                  phyclust_0.1-34             foreach_1.5.2               httpuv_1.6.15               getopt_1.20.4              
## [175] RANN_2.6.2                  tidyr_1.3.1                 purrr_1.0.2                 polyclip_1.10-7             future_1.34.0               scattermore_1.2            
## [181] coin_1.4-3                  xtable_1.8-4                RSpectra_0.16-2             later_1.3.2                 viridisLite_0.4.2           rjags_4-16                 
## [187] tibble_3.2.1                memoise_2.0.1               AnnotationDbi_1.66.0        IRanges_2.38.1              cluster_2.1.6               globals_0.16.3